# package
library(ape)
## Warning: package 'ape' was built under R version 3.5.2
library(clusterProfiler)
## 
## clusterProfiler v3.10.1  For help: https://guangchuangyu.github.io/software/clusterProfiler
## 
## If you use clusterProfiler in published research, please cite:
## Guangchuang Yu, Li-Gen Wang, Yanyan Han, Qing-Yu He. clusterProfiler: an R package for comparing biological themes among gene clusters. OMICS: A Journal of Integrative Biology. 2012, 16(5):284-287.
#bring in phase1 allele frequencies
phase1.all.freqs<-read.csv(file = "~/Desktop/anoph.3.march.2020/phase1.all.freqs.csv")
#make column for id
phase1.all.freqs$id<-paste(phase1.all.freqs$chrom,phase1.all.freqs$POS)
#bring in dataframe with locality and environmental data for each individual phase1
phase1.alldata<-read.csv(file = "~/Desktop/anoph.3.march.2020/phase1.allvariables.csv")
#subset to only the variables we need (lat,long,hum,temp,precip)
meta<-phase1.alldata[,c(16,17,18,20,22)]
#subset only the unique lat/longs
metapops<-unique(meta)
#sort by latitude to match the order of the allele frequency files
sort.pops.phase1 <-metapops[order(metapops$latitude),]

## read in lfmm outlier table
lfmm.outlier.table <- read.csv("~/Downloads/vetted.lfmm.hum.env.csv")[,2:3]
#add separate chrom and pos columns
lfmm.outlier.table$chrom<-data.frame(do.call('rbind', strsplit(as.character(lfmm.outlier.table$id),' ',fixed=TRUE)))[,1]
lfmm.outlier.table$pos<-data.frame(do.call('rbind', strsplit(as.character(lfmm.outlier.table$id),' ',fixed=TRUE)))[,2]

## read in bayescan outlier table
bayescan.outlier.table <- read.csv("~/Downloads/vetted.bayescenv.hum.env.csv")[,2:3]
#add separate chrom and pos columns
bayescan.outlier.table$chrom<-data.frame(do.call('rbind', strsplit(as.character(bayescan.outlier.table$id),' ',fixed=TRUE)))[,1]
bayescan.outlier.table$pos<-data.frame(do.call('rbind', strsplit(as.character(bayescan.outlier.table$id),' ',fixed=TRUE)))[,2]

## read in genomic regions and attributes file
genomic_location <- read.gff("~/Downloads/Anopheles_gambiae.AgamP4.47.chr.gff3.gz")
#keep only regions identified as "gene"
gen_regions <- genomic_location[genomic_location$type == "gene", ] # subset to only gene regions

#pull all genes queried
all.gene.id<-data.frame(do.call('rbind', strsplit(as.character(gen_regions$attributes),';',fixed=TRUE)))[,1]
## Warning in rbind(c("ID=gene:AGAP004677", "biotype=protein_coding",
## "description=methylenetetrahydrofolate dehydrogenase(NAD+) / 5%2C10-
## methenyltetrahydrofolate [Source:VB Community Annotation]", : number of columns
## of result is not a multiple of vector length (arg 1)
all.genes<-data.frame(do.call('rbind', strsplit(as.character(all.gene.id),':',fixed=TRUE)))[,2]
#write.table(all.genes, "~/Downloads/all.genes.anopheles.gambiae.txt", row.names = F, col.names = F, quote = F)
#define chroms
chroms <- c("2R", "2L", "3R", "3L","X") # chromosomes of interest

#initialize gene.list for lfmm
lfmm.gene.list<-data.frame()
#fill gene.list by matching location of genes of interest in gene feature regions by chromosomes
for (i in 1:length(chroms)){
  genloc <- gen_regions[gen_regions$seqid == chroms[i], ] # genome info per chromosome
  pos <- as.numeric(as.character(lfmm.outlier.table[lfmm.outlier.table$chrom == chroms[i],4])) # outlierFST position per chromosome
  for (w in 1:length(pos)) {
    if (nrow(genloc[pos[w] >= genloc$start & pos[w] <= genloc$end,]) !=0){
    lfmm.gene.list<-rbind(lfmm.gene.list, cbind(genloc[pos[w] >= genloc$start & pos[w] <= genloc$end,], pos[w]))
    }
  }
  cat("chromosome", i, "of", length(chroms), "finished\n\n")
}
## chromosome 1 of 5 finished
## 
## chromosome 2 of 5 finished
## 
## chromosome 3 of 5 finished
## 
## chromosome 4 of 5 finished
## 
## chromosome 5 of 5 finished
#pull out gene ID as its own column
lfmm.gene.list$gene.id<-data.frame(do.call('rbind', strsplit(as.character(lfmm.gene.list$attributes),';',fixed=TRUE)))[,1]
## Warning in rbind(c("ID=gene:AGAP001881", "biotype=protein_coding",
## "description=protease m1 zinc metalloprotease [Source:VB Community
## Annotation]", : number of columns of result is not a multiple of vector length
## (arg 1)
lfmm.gene.list$gene<-data.frame(do.call('rbind', strsplit(as.character(lfmm.gene.list$gene.id),':',fixed=TRUE)))[,2]

#make SNP ID column
lfmm.gene.list$SNPid<-paste(lfmm.gene.list$seqid,lfmm.gene.list$`pos[w]`)

#subset
lfmm.gene.list<-lfmm.gene.list[,c("SNPid","gene")]

#calculate percentage of lfmm candidate SNPs that were in a gene
nrow(lfmm.gene.list)/nrow(lfmm.outlier.table)
## [1] 0.5205939
table(lfmm.gene.list$gene)[table(lfmm.gene.list$gene) >10]
## 
## AGAP000519 AGAP002310 AGAP002336 AGAP005378 AGAP007086 AGAP007732 AGAP007736 
##         19         11         13         11         11         11         47 
## AGAP008467 AGAP009189 AGAP009200 AGAP009201 AGAP009213 AGAP010184 AGAP029480 
##         18         17         14         38         12         25         20 
## AGAP029511 
##         47
hist(table(lfmm.gene.list$gene))

#write.table(unique(lfmm.gene.list$gene), "~/Downloads/lfmm.hum.gene.list.txt", row.names = F, col.names = F, quote = F)
#write.table(lfmm.gene.list, "~/Downloads/lfmm.hum.gene.snp.match.txt", row.names = F, col.names = T, quote = F)
#initialize gene.list for bayescan
bayescan.gene.list<-data.frame()
#fill gene.list
for (i in 1:length(chroms)){
  genloc <- gen_regions[gen_regions$seqid == chroms[i], ] # genome info per chromosome
  pos <- as.numeric(as.character(bayescan.outlier.table[bayescan.outlier.table$chrom == chroms[i],4])) # outlierFST position per chromosome
  for (w in 1:length(pos)) {
    if (nrow(genloc[pos[w] >= genloc$start & pos[w] <= genloc$end,]) !=0){
      bayescan.gene.list<-rbind(bayescan.gene.list, cbind(genloc[pos[w] >= genloc$start & pos[w] <= genloc$end,], pos[w]))
    }
  }
  cat("chromosome", i, "of", length(chroms), "finished\n\n")
}
## chromosome 1 of 5 finished
## 
## chromosome 2 of 5 finished
## 
## chromosome 3 of 5 finished
## 
## chromosome 4 of 5 finished
## 
## chromosome 5 of 5 finished
#pull out gene ID as its own column
bayescan.gene.list$gene.id<-data.frame(do.call('rbind', strsplit(as.character(bayescan.gene.list$attributes),';',fixed=TRUE)))[,1]
## Warning in rbind(c("ID=gene:AGAP029581", "biotype=protein_coding",
## "gene_id=AGAP029581", : number of columns of result is not a multiple of vector
## length (arg 1)
bayescan.gene.list$gene<-data.frame(do.call('rbind', strsplit(as.character(bayescan.gene.list$gene.id),':',fixed=TRUE)))[,2]

#make SNP ID column
bayescan.gene.list$SNPid<-paste(bayescan.gene.list$seqid,bayescan.gene.list$`pos[w]`)

#subset
bayescan.gene.list<-bayescan.gene.list[,c("SNPid","gene")]

#calculate percentage of bayescan candidate SNPs that were in a gene
nrow(bayescan.gene.list)/nrow(bayescan.outlier.table)
## [1] 0.5978078
table(bayescan.gene.list$gene)[table(bayescan.gene.list$gene) >10]
## 
## AGAP002336 AGAP002581 AGAP002824 AGAP002858 AGAP002859 AGAP002881 AGAP002896 
##         14         13         17         38         49         16         17 
## AGAP003623 AGAP003658 AGAP003669 AGAP004775 AGAP013179 
##         13         42         11         27         15
hist(table(bayescan.gene.list$gene))

#write.table(unique(bayescan.gene.list$gene), "~/Downloads/bayescan.hum.gene.list.txt", row.names = F, col.names = F, quote = F)
#write.table(bayescan.gene.list, "~/Downloads/bayescan.hum.gene.snp.match.txt", row.names = F, col.names = T, quote = F)
#look into how many genes overlap between the methods
intersect(unique(lfmm.gene.list$gene),unique(bayescan.gene.list$gene))
##  [1] "AGAP002310" "AGAP002336" "AGAP002369" "AGAP002573" "AGAP002578"
##  [6] "AGAP002649" "AGAP002678" "AGAP002741" "AGAP002748" "AGAP002822"
## [11] "AGAP002826" "AGAP002858" "AGAP002859" "AGAP002879" "AGAP002883"
## [16] "AGAP002896" "AGAP003632" "AGAP003633" "AGAP004767" "AGAP009201"
## [21] "AGAP009210" "AGAP010817" "AGAP010820" "AGAP029564" "AGAP029616"
## [26] "AGAP000821" "AGAP000830" "AGAP000519"

LFMM overrep testing

#overrepresentation testing
#read in each GO term to gene map (col1=geneID,col2=GOterm)
mart.export<-read.table("~/Downloads/mart_export (3).txt", sep = "\t", header = T)

#calculate overrepresentation for lfmm hum
lfmm.hum.enriched<-enricher(gene = lfmm.gene.list$gene,
                 universe = all.genes,
                   pAdjustMethod = "fdr",
                 TERM2GENE= mart.export[,c(2,1)]
)
result<-lfmm.hum.enriched@result

lfmm.hum.enriched.sig<-result[result$p.adjust <= .05,]
lfmm.hum.enriched.sig
#write.table(lfmm.hum.enriched.sig[,c("ID","geneID")], file="~/Downloads/lfmm.hum.go.overrep.txt",
#            row.names = F, col.names = F, quote = F)

Bayescenv overrep testing

#calculate overrepresentation for bayescan hum
bayescan.hum.enriched<-enricher(gene = bayescan.gene.list$gene,
                            universe = all.genes,
                            pAdjustMethod = "fdr",
                            TERM2GENE= mart.export[,c(2,1)]
                            )
result<-bayescan.hum.enriched@result

#retain significantly enriched GO terms
bayescan.hum.enriched.sig<-result[result$p.adjust <= .05,]

#check out significantly overrepresented GO terms
bayescan.hum.enriched.sig
#get a vector of genes we found SNPs in for each enriched GO term
#lfmm
go.0055085<-unlist(strsplit(as.character(lfmm.hum.enriched.sig$geneID[lfmm.hum.enriched.sig$ID == "GO:0055085"]),'/'))
go.0004672<-unlist(strsplit(as.character(lfmm.hum.enriched.sig$geneID[lfmm.hum.enriched.sig$ID == "GO:0004672"]),'/'))
go.0010951<-unlist(strsplit(as.character(lfmm.hum.enriched.sig$geneID[lfmm.hum.enriched.sig$ID == "GO:0010951"]),'/'))
go.0005509<-unlist(strsplit(as.character(lfmm.hum.enriched.sig$geneID[lfmm.hum.enriched.sig$ID == "GO:0005509"]),'/'))
go.0006468<-unlist(strsplit(as.character(lfmm.hum.enriched.sig$geneID[lfmm.hum.enriched.sig$ID == "GO:0006468"]),'/'))
#bayescan
go.0004930<-unlist(strsplit(as.character(bayescan.hum.enriched.sig$geneID[bayescan.hum.enriched.sig$ID == "GO:0004930"]),'/'))
go.0007186<-unlist(strsplit(as.character(bayescan.hum.enriched.sig$geneID[bayescan.hum.enriched.sig$ID == "GO:0007186"]),'/'))
go.0006368<-unlist(strsplit(as.character(bayescan.hum.enriched.sig$geneID[bayescan.hum.enriched.sig$ID == "GO:0006368"]),'/'))
#take them 1 by 1
#GO:0055085 = transmembrane transport
#The process in which a solute is transported across a lipid bilayer, from one side of a membrane to the other.
go.0055085
##  [1] "AGAP001966" "AGAP002331" "AGAP002369" "AGAP002578" "AGAP002826"
##  [6] "AGAP002859" "AGAP001634" "AGAP006347" "AGAP007086" "AGAP009123"
## [11] "AGAP007732" "AGAP009188" "AGAP007733" "AGAP009835" "AGAP010269"
## [16] "AGAP029240" "AGAP010794" "AGAP010854" "AGAP029297" "AGAP010383"
## [21] "AGAP029048" "AGAP012154" "AGAP010562" "AGAP000795"
#make subset list of SNPs and genes associated with this specific GO term
lfmm.gene.list.go.0055085<-lfmm.gene.list[lfmm.gene.list$gene %in% go.0055085,]
#This is a set of genes and SNPs found all over the genome
table(lfmm.gene.list[lfmm.gene.list$gene %in% go.0055085,]$gene)[table(lfmm.gene.list[lfmm.gene.list$gene %in% go.0055085,]$gene) >0]
## 
## AGAP000795 AGAP001634 AGAP001966 AGAP002331 AGAP002369 AGAP002578 AGAP002826 
##          3          1          1          2          1          5          1 
## AGAP002859 AGAP006347 AGAP007086 AGAP007732 AGAP007733 AGAP009123 AGAP009188 
##          4          3         11         11          3          1          1 
## AGAP009835 AGAP010269 AGAP010383 AGAP010562 AGAP010794 AGAP010854 AGAP012154 
##          1          2          3          2          1          2          1 
## AGAP029048 AGAP029240 AGAP029297 
##          1          1          1
#a few genes have many SNPs and some have only 1

plot all SNPs associated w/ GO:055085

#plot all SNPs associated w/ GO:0055085
sig.pos<-lfmm.gene.list.go.0055085$SNPid
gene<-droplevels(lfmm.gene.list.go.0055085$gene)
#plot allele freqs vs humidity for these SNPs
par(mfrow=c(3,3))
for (i in 1:length(sig.pos)){
  frq<-as.numeric(as.vector(phase1.all.freqs[phase1.all.freqs$id == sig.pos[i],3:16]))
  plot(sort.pops.phase1$hannual,frq, main = gene[i], ylim = c(0,1), xlab=sig.pos[i])
  abline(lm(frq~sort.pops.phase1$hannual))
}

plot all SNPs associated w/ GO:0004672

lfmm.gene.list.go.0004672<-lfmm.gene.list[lfmm.gene.list$gene %in% go.0004672,]
#plot all SNPs associated w/ GO:0004672
sig.pos<-lfmm.gene.list.go.0004672$SNPid
gene<-droplevels(lfmm.gene.list.go.0004672$gene)
#plot allele freqs vs humidity for these SNPs
par(mfrow=c(3,3))
for (i in 1:length(sig.pos)){
  frq<-as.numeric(as.vector(phase1.all.freqs[phase1.all.freqs$id == sig.pos[i],3:16]))
  plot(sort.pops.phase1$hannual,frq, main = gene[i], ylim = c(0,1), xlab=sig.pos[i])
  abline(lm(frq~sort.pops.phase1$hannual))
}

plot all SNPs associated w/ GO:0010951

lfmm.gene.list.go.0010951<-lfmm.gene.list[lfmm.gene.list$gene %in% go.0010951,]
#plot all SNPs associated w/ GO:0010951
sig.pos<-lfmm.gene.list.go.0010951$SNPid
gene<-droplevels(lfmm.gene.list.go.0010951$gene)
#plot allele freqs vs humidity for these SNPs
par(mfrow=c(3,3))
for (i in 1:length(sig.pos)){
  frq<-as.numeric(as.vector(phase1.all.freqs[phase1.all.freqs$id == sig.pos[i],3:16]))
  plot(sort.pops.phase1$hannual,frq, main = gene[i], ylim = c(0,1), xlab=sig.pos[i])
  abline(lm(frq~sort.pops.phase1$hannual))
}

plot all SNPs associated w/ GO:0005509

lfmm.gene.list.go.0005509<-lfmm.gene.list[lfmm.gene.list$gene %in% go.0005509,]
#plot all SNPs associated w/ GO:0005509
sig.pos<-lfmm.gene.list.go.0005509$SNPid
gene<-droplevels(lfmm.gene.list.go.0005509$gene)
#plot allele freqs vs humidity for these SNPs
par(mfrow=c(3,3))
for (i in 1:length(sig.pos)){
  frq<-as.numeric(as.vector(phase1.all.freqs[phase1.all.freqs$id == sig.pos[i],3:16]))
  plot(sort.pops.phase1$hannual,frq, main = gene[i], ylim = c(0,1), xlab=sig.pos[i])
  abline(lm(frq~sort.pops.phase1$hannual))
}

plot all SNPs associated w/ GO:0006468

lfmm.gene.list.go.0006468<-lfmm.gene.list[lfmm.gene.list$gene %in% go.0006468,]
#plot all SNPs associated w/ GO:0006468
sig.pos<-lfmm.gene.list.go.0006468$SNPid
gene<-droplevels(lfmm.gene.list.go.0006468$gene)
#plot allele freqs vs humidity for these SNPs
par(mfrow=c(3,3))
for (i in 1:length(sig.pos)){
  frq<-as.numeric(as.vector(phase1.all.freqs[phase1.all.freqs$id == sig.pos[i],3:16]))
  plot(sort.pops.phase1$hannual,frq, main = gene[i], ylim = c(0,1), xlab=sig.pos[i])
  abline(lm(frq~sort.pops.phase1$hannual))
}

plot all SNPs associated w/ GO:0004930

bayescan.gene.list.go.0004930<-bayescan.gene.list[bayescan.gene.list$gene %in% go.0004930,]
#plot all SNPs associated w/ GO:0004930
sig.pos<-bayescan.gene.list.go.0004930$SNPid
gene<-droplevels(bayescan.gene.list.go.0004930$gene)
#plot allele freqs vs humidity for these SNPs
par(mfrow=c(3,3))
for (i in 1:length(sig.pos)){
  frq<-as.numeric(as.vector(phase1.all.freqs[phase1.all.freqs$id == sig.pos[i],3:16]))
  plot(sort.pops.phase1$hannual,frq, main = gene[i], ylim = c(0,1), xlab=sig.pos[i])
  abline(lm(frq~sort.pops.phase1$hannual))
}

plot all SNPs associated w/ GO:0007186

bayescan.gene.list.go.0007186<-bayescan.gene.list[bayescan.gene.list$gene %in% go.0007186,]
#plot all SNPs associated w/ GO:0007186
sig.pos<-bayescan.gene.list.go.0007186$SNPid
gene<-droplevels(bayescan.gene.list.go.0007186$gene)
#plot allele freqs vs humidity for these SNPs
par(mfrow=c(3,3))
for (i in 1:length(sig.pos)){
  frq<-as.numeric(as.vector(phase1.all.freqs[phase1.all.freqs$id == sig.pos[i],3:16]))
  plot(sort.pops.phase1$hannual,frq, main = gene[i], ylim = c(0,1), xlab=sig.pos[i])
  abline(lm(frq~sort.pops.phase1$hannual))
}

plot all SNPs associated w/ GO:0006368

bayescan.gene.list.go.0006368<-bayescan.gene.list[bayescan.gene.list$gene %in% go.0006368,]
#plot all SNPs associated w/ GO:0006368
sig.pos<-bayescan.gene.list.go.0006368$SNPid
gene<-droplevels(bayescan.gene.list.go.0006368$gene)
#plot allele freqs vs humidity for these SNPs
par(mfrow=c(3,3))
for (i in 1:length(sig.pos)){
  frq<-as.numeric(as.vector(phase1.all.freqs[phase1.all.freqs$id == sig.pos[i],3:16]))
  plot(sort.pops.phase1$hannual,frq, main = gene[i], ylim = c(0,1), xlab=sig.pos[i])
  abline(lm(frq~sort.pops.phase1$hannual))
}

Best places to find info:

uniprot database:

uniprot.org/

UC Irvine AGAM Gene Expression Profile

http://www.angagepuci.bio.uci.edu/

AG1000G selection atlas

https://malariagen.github.io/agam-selection-atlas/0.1-alpha3/index.html